Here is the economic data, exclude some unnecessary columns as the requirements. We use scale() to make the data dimensionless and standard Gaussian distributed.
head(data_rescaled)
## Food.Costs... Womens.Clothing... Clothing.Index Hours.Worked
## Amsterdam -0.45035658 0.48363574 0.57332103 -0.8405372
## Athens -0.25356211 0.25529624 0.61804095 -0.4843573
## Auckland 0.55632284 -0.01109984 -0.25794331 -0.3248738
## Bangkok -0.01135353 -0.62000516 -0.65253082 2.1205400
## Barcelona -0.22328604 0.06501333 0.53123170 -0.8086405
## Beijing 0.29897622 0.36946599 -0.03960489 0.3502730
## Wage.Gross Vacation.Days COL..Excl..rent. Bread.kg.in.min.
## Amsterdam 0.8273374 0.56575346 0.4307124 -0.89634286
## Athens -0.2717526 0.40968354 -0.1602371 -0.37936006
## Auckland 0.2763031 -0.05852622 0.4144478 -0.03470486
## Bangkok -1.0700076 -2.08743517 -0.7457650 0.74076933
## Barcelona 0.2703460 1.34610306 0.3060167 -0.46552386
## Beijing -0.9985220 -1.77529534 -0.4746873 0.91309693
## Rice.kg.in.min. Goods.and.Services... Good.and.Services.Index
## Amsterdam -0.77072258 0.4327796 0.4307124
## Athens 1.18241290 -0.1577707 -0.1602371
## Auckland -0.88561290 0.4121309 0.4144478
## Bangkok 0.49307097 -0.7455678 -0.7457650
## Barcelona -1.11539355 0.3047582 0.3060167
## Beijing 0.03350968 -0.4743827 -0.4746873
plot_ly(x=colnames(data_rescaled), y=rownames(data_rescaled),
z=data_rescaled, type="heatmap", colors =colorRamp(c("yellow", "red")))%>%
layout(title = "Heatmap of economic data")
It is hard to see clusters and outliers, since no regular pattern can be seen by such heatmap.
rowdist<-dist(data_rescaled,method = "minkowski", p=2)
coldist<-dist(t(data_rescaled),method = "minkowski", p=2)
order1<-seriate(rowdist, "GW")
order2<-seriate(coldist, "GW")
ord1<-get_order(order1)
ord2<-get_order(order2)
result <- criterion(rowdist, order = order1,
method = c("Path_length","Gradient_raw"))
result <- rbind(result,criterion(coldist, order = order2,
method = c("Path_length","Gradient_raw")))
data_rescaled0 <- data_rescaled[rev(ord1),ord2]
plot_ly(x=colnames(data_rescaled0), y=rownames(data_rescaled0), z=data_rescaled0,
type="heatmap", colors =colorRamp(c("yellow", "red")))%>%
layout(title = "Heatmap by Euclidean distance order")
coldist<-as.dist(1-cor(data_rescaled))
rowdist<-as.dist(1-cor(t(data_rescaled)))
order1<-seriate(rowdist, "GW")
order2<-seriate(coldist, "GW")
ord1<-get_order(order1)
ord2<-get_order(order2)
data_rescaled0 <- data_rescaled[rev(ord1),ord2]
plot_ly(x=colnames(data_rescaled0), y=rownames(data_rescaled0), z=data_rescaled0,
type="heatmap", colors =colorRamp(c("yellow", "red")))%>%
layout(title = "Heatmap by one minus correlation order")
Evidently, the first plot is much easier to cluster since the color brightness changes gradually. According to such plot, there is an obvious separation from Tokyo to around Paris and from Ljubljana to Delhi (this is still a rough visual clustering). The cities on the top part have relatively high wage gross and high costs in food, clothing, goods, services, etc. On contrary, the cities on the bottom part have high values in Bread/rice.kg.in.min and work hours compared with the ones on the top.
In this bottom part, it can also divided into two clusters. The first one includes cities from Doha to Delhi, which have the higher hours of work and a lower number of vacation days compared with the other cities in the bottom part. So, there are three clusters in total. Additionally, such cities as Caracas (high Bread.kg.in.min.), Manila (high Bread.kg.in.min.) and Ljubljana (high Rice.kg.in.min.) are the most obvious outliers in their groups.
rowdist<-dist(data_rescaled,method = "minkowski", p=2)
coldist<-dist(t(data_rescaled),method = "minkowski", p=2)
order1<-seriate(rowdist, "TSP")
order2<-seriate(coldist, "TSP")
ord1<-get_order(order1)
ord2<-get_order(order2)
result <- rbind(result,criterion(rowdist, order = order1,
method = c("Path_length","Gradient_raw")))
result <- rbind(result,criterion(coldist, order = order2,
method = c("Path_length","Gradient_raw")))
data_rescaled0<-data_rescaled[ord1,ord2]
plot_ly(x=colnames(data_rescaled0), y=rownames(data_rescaled0),
z=data_rescaled0, type="heatmap", colors =colorRamp(c("yellow", "red")))%>% layout(title = "Heatmap with TSP solver")
rownames(result) <- c("Row HC","Col HC","Row TSP","Col TSP")
result
## Path_length Gradient_raw
## Row HC 137.63293 59968
## Col HC 59.34424 203
## Row TSP 122.40544 37120
## Col TSP 59.18851 167
We create a new heatmap which reorders Euclidean distances by TSP solver. According to the criterion() to TSP heatmap and HC heatmap, the TSP heatmap seems to be a better plot since it has lower objective function values for both row and column orders built on the current heatmap. Furthermore, the color of TSP heatmap change more smoothly than of the HC heatmap.
ord=1:11
dims0=list()
for( i in 1:ncol(data_rescaled)){
dims0[[i]]=list( label=colnames(data_rescaled)[ord[i]],
values=as.formula(paste("~",colnames(data_rescaled)[ord[i]])))
}
p <- as.data.frame(data_rescaled) %>%
plot_ly(type = 'parcoords',
line = list(color = ~as.numeric(Wage.Gross)),
dimensions = dims0
)%>%layout(title = "Plots from unsorted data")
ggplotly(p)
color <- data_rescaled[,"Wage.Gross"]>0
color[which(rownames(data) == "Caracas")] = 2
data_rescaled <- cbind(data_rescaled,color)
ord=c(8,9,4,2,5,1,3,7,10,11,6)
# ord<-Order
dims0=list()
for( i in 1:ncol(data_rescaled)){
dims0[[i]]=list( label=colnames(data)[ord[i]],
values=as.formula(paste("~",colnames(data)[ord[i]])))
}
p <- as.data.frame(data_rescaled) %>%
plot_ly(type = 'parcoords',
line = list(color = ~color),
dimensions = dims0
)%>%layout(title = "Plots from sorted data")
ggplotly(p)
We think that “Wage.Gross” is the most important variable to define these clustering. According to the rescaled data, all cities with purple paths have high rescaled Wage.Gross (>0) and others have low rescaled Wage.Gross (<=0). Higher wage gross might represent stronger abilities in cost of food, goods, services, clothing, etc. It is also possibly connected with longer vacation days and less hours worked.
In addition, city Caracas is an obvious outlier in our clusters since it has a low rescaled Wage.Gross but high values in cost of Goods & Services, which is similar with most cities with high rescaled Wage.Gross.
For instance, Tokyo, Montreal, Chicago, Los Angeles, Miami, New York are in one cluster. Paris, Helsinki, Lyon, Frankturt, Amsterdam, Copenhagen are in another cluster. The main difference between such two clusters is citizens in the second cluster have shorter working hours compared with that in the first cluster.
Delhi is the most distinct outlier, since its radar plot is the most narrow and only with high values in Hours Worked, Rice.kg.in.min.
The parallel plot is the best, because it is efficient to show the connection among different features of our data.
On the other hand, radar plots can show us characteristics of each city easily and quickly. It is necessary when make analysis in details for each city.
Here is the scatter plot draw by ggplot2:
p2.1.1 <- ggplot(adult, aes(x=age, y=workHours,colour = income)) + geom_point()+
scale_color_manual(values = c("red", "black"))
p2.1.1
The problem of this plot is data of two income groups is mixed. It lead to hard to recognize the shape of each group when analysis the data. Now, let’s see the trellis plot (condition on Income Level)
p2.1.2 <- p2.1.1 + facet_grid(income~.)
p2.1.2
Now, the graphs are more clearly. Most of the rich people (income>50k) are more than 24 years old. They also don’t work too much or too litter ( not more than 80 hours per week or less than 20 hours).
On the other hand, the graph of poor people is understandable. People those under 20 maybe don’t work, which mean that they don’t have any income. About one third only work under 25 hours per week, maybe they just have a part-time work. others may have a low salary, so they need to work a lot to cover the live expend. It leads to the range of this plot is higher than the plot of rich people.
Here is the density plot draw by ggplot2:
p2.2.1 <- ggplot()+
geom_density(data=adult, aes(x=age, group=income, fill=income),alpha=0.5, adjust=2) +
xlab("Age") +
ylab("Density")
p2.2.1
In the graph drawn by ggplot2, we can see that the plot of rich people (income >50k) look like the normal distribution, while the plot of (relatively) poor people (income <50k) is asymmetric to the left. And, the trellis plots can explain more detailed information.
And now, trellis plot condition on Marital Status:
p2.2.2 <- p2.2.1 + facet_wrap(marital~.)
p2.2.2
The main difference is for people who never-married and Married-spouse-absent. Most of the poor people in this group are young, while the rich are older. Maybe the rich don’t spend too much time for their family and focus on their career, which makes them become richer. To be more specific, not so many poor people are still never-married after around 30 years old. It might mean that their marital status are changed, most of them are for example married after 25 years old.
The density plot for other marital status is approximately similar between the two groups.
Let start with the 3D plot:
adult_filted <- adult[adult$capitalLoss >0,]
p2.3.1 <- plot_ly(adult_filted, x = ~educationNum, y = ~age, z = ~capitalLoss) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Education-num'),
yaxis = list(title = 'Age'),
zaxis = list(title = 'Capital-Loss')))
p2.3.1
This plot is difficult to analyze because we can’t see some clear patterns and outliers.
p2.3.2 <- ggplot(data = adult_filted, aes(x = capitalLoss, y = educationNum))+
stat_density_2d(geom = "raster", aes(fill = stat(density)), contour = FALSE)+
facet_wrap(cut_number(adult_filted$age, 6)~.)
p2.3.2
The capital Loss is the similar for all people, regardless the age and the education number (of course the old people have higher capital Loss than young people, but not too much). Additionally, most of people in the youngest group are educated around level 10. With the growth of age, few people are educated lower than level 8, and the populations of high (>11) and middle (8-11) educated are quite similar.
p2.4.1 <- ggplot()+
geom_point(data = adult_filted, aes(x = capitalLoss, y = educationNum))+
facet_wrap(cut_number(adult_filted$age, 4)~.)
p2.4.1
#b
Agerange <- lattice::equal.count(adult_filted$age, number=4, overlap=0.10)
L<-matrix(unlist(levels(Agerange)), ncol=2, byrow = T)
L1<-data.frame(Lower=L[,1],Upper=L[,2], Interval=factor(1:nrow(L)))
#Plot of Interval
#ggplot(L1)+geom_linerange(aes(ymin = Lower, ymax = Upper, x=Interval))
index=c()
Class=c()
for(i in 1:nrow(L)){
Cl=paste("[", L1$Lower[i], ",", L1$Upper[i], "]", sep="")
ind=which(adult_filted$age>=L1$Lower[i] &adult_filted$age<=L1$Upper[i])
index=c(index,ind)
Class=c(Class, rep(Cl, length(ind)))
}
df<-adult_filted[index,]
df$Class<-as.factor(Class)
ggplot(df, aes(x=capitalLoss, y=educationNum))+
geom_point()+
facet_wrap(~Class, labeller = "label_both")
We can see that both plots look very similar, but still have some small differences. When using Shingles, because of the overlap in range age, we can see some points in both two graph. For example, the lower point in the right of the first and second graphs.
In our opinions, Shingles help us see the transformation of the pattern (depend on the percentage of overlap) the disadvantage is some points that present in both plots can mislead us (especially the outliers).
Please put the code button to see the code of this report.
library(plotly)
library(ggplot2)
library(seriation)
library(dplyr)
library(scales)
set.seed(15)
data <- read.table("prices-and-earnings.txt", fill = TRUE, sep = "\t", header = TRUE, row.names = "City")
data <- data[,c(1,2,5,6,7,9,10,16,17,18,19)]
data_rescaled <- scale(data,center = TRUE, scale=TRUE)
head(data_rescaled)
plot_ly(x=colnames(data_rescaled), y=rownames(data_rescaled),
z=data_rescaled, type="heatmap", colors =colorRamp(c("yellow", "red")))%>%
layout(title = "Heatmap of economic data")
rowdist<-dist(data_rescaled,method = "minkowski", p=2)
coldist<-dist(t(data_rescaled),method = "minkowski", p=2)
order1<-seriate(rowdist, "GW")
order2<-seriate(coldist, "GW")
ord1<-get_order(order1)
ord2<-get_order(order2)
result <- criterion(rowdist, order = order1,
method = c("Path_length","Gradient_raw"))
result <- rbind(result,criterion(coldist, order = order2,
method = c("Path_length","Gradient_raw")))
data_rescaled0 <- data_rescaled[rev(ord1),ord2]
plot_ly(x=colnames(data_rescaled0), y=rownames(data_rescaled0), z=data_rescaled0,
type="heatmap", colors =colorRamp(c("yellow", "red")))%>%
layout(title = "Heatmap by Euclidean distance order")
coldist<-as.dist(1-cor(data_rescaled))
rowdist<-as.dist(1-cor(t(data_rescaled)))
order1<-seriate(rowdist, "GW")
order2<-seriate(coldist, "GW")
ord1<-get_order(order1)
ord2<-get_order(order2)
data_rescaled0 <- data_rescaled[rev(ord1),ord2]
plot_ly(x=colnames(data_rescaled0), y=rownames(data_rescaled0), z=data_rescaled0,
type="heatmap", colors =colorRamp(c("yellow", "red")))%>%
layout(title = "Heatmap by one minus correlation order")
rowdist<-dist(data_rescaled,method = "minkowski", p=2)
coldist<-dist(t(data_rescaled),method = "minkowski", p=2)
order1<-seriate(rowdist, "TSP")
order2<-seriate(coldist, "TSP")
ord1<-get_order(order1)
ord2<-get_order(order2)
result <- rbind(result,criterion(rowdist, order = order1,
method = c("Path_length","Gradient_raw")))
result <- rbind(result,criterion(coldist, order = order2,
method = c("Path_length","Gradient_raw")))
data_rescaled0<-data_rescaled[ord1,ord2]
plot_ly(x=colnames(data_rescaled0), y=rownames(data_rescaled0),
z=data_rescaled0, type="heatmap", colors =colorRamp(c("yellow", "red")))%>% layout(title = "Heatmap with TSP solver")
rownames(result) <- c("Row HC","Col HC","Row TSP","Col TSP")
result
ord=1:11
dims0=list()
for( i in 1:ncol(data_rescaled)){
dims0[[i]]=list( label=colnames(data_rescaled)[ord[i]],
values=as.formula(paste("~",colnames(data_rescaled)[ord[i]])))
}
p <- as.data.frame(data_rescaled) %>%
plot_ly(type = 'parcoords',
line = list(color = ~as.numeric(Wage.Gross)),
dimensions = dims0
)%>%layout(title = "Plots from unsorted data")
ggplotly(p)
color <- data_rescaled[,"Wage.Gross"]>0
color[which(rownames(data) == "Caracas")] = 2
data_rescaled <- cbind(data_rescaled,color)
ord=c(8,9,4,2,5,1,3,7,10,11,6)
# ord<-Order
dims0=list()
for( i in 1:ncol(data_rescaled)){
dims0[[i]]=list( label=colnames(data)[ord[i]],
values=as.formula(paste("~",colnames(data)[ord[i]])))
}
p <- as.data.frame(data_rescaled) %>%
plot_ly(type = 'parcoords',
line = list(color = ~color),
dimensions = dims0
)%>%layout(title = "Plots from sorted data")
ggplotly(p)
Ps=list()
nPlot=72
data[ord1,] %>%
add_rownames( var = "group" ) %>%
mutate_each(funs(rescale), -group) -> pe_radar
for (i in 1:nPlot){
Ps[[i]] <- htmltools::tags$div(
plot_ly(type = 'scatterpolar',
r=as.numeric(pe_radar[i,-1]),
theta= colnames(pe_radar)[-1],
fill="toself")%>%
layout(title=pe_radar$group[i]), style="width: 20%;") # 4 plots per row
}
h <-htmltools::tags$div(style = "display: flex; flex-wrap: wrap", Ps)
p <- htmltools::browsable(h)
p
# lab_path <- "C:/Users/Duong Minh Duc/Desktop/adult.csv"
lab_path <- "adult.csv"
adult <- read.csv(file = lab_path)
names(adult) <- c("age", "workClass", "popuIndex", "education", "educationNum", "marital", "occupation", "relationship", "race", "sex", "capitalGain", "capitalLoss", "workHours", "country", "income")
p2.1.1 <- ggplot(adult, aes(x=age, y=workHours,colour = income)) + geom_point()+
scale_color_manual(values = c("red", "black"))
p2.1.1
p2.1.2 <- p2.1.1 + facet_grid(income~.)
p2.1.2
p2.2.1 <- ggplot()+
geom_density(data=adult, aes(x=age, group=income, fill=income),alpha=0.5, adjust=2) +
xlab("Age") +
ylab("Density")
p2.2.1
p2.2.2 <- p2.2.1 + facet_wrap(marital~.)
p2.2.2
adult_filted <- adult[adult$capitalLoss >0,]
p2.3.1 <- plot_ly(adult_filted, x = ~educationNum, y = ~age, z = ~capitalLoss) %>%
add_markers() %>%
layout(scene = list(xaxis = list(title = 'Education-num'),
yaxis = list(title = 'Age'),
zaxis = list(title = 'Capital-Loss')))
p2.3.1
p2.3.2 <- ggplot(data = adult_filted, aes(x = capitalLoss, y = educationNum))+
stat_density_2d(geom = "raster", aes(fill = stat(density)), contour = FALSE)+
facet_wrap(cut_number(adult_filted$age, 6)~.)
p2.3.2
p2.4.1 <- ggplot()+
geom_point(data = adult_filted, aes(x = capitalLoss, y = educationNum))+
facet_wrap(cut_number(adult_filted$age, 4)~.)
p2.4.1
#b
Agerange <- lattice::equal.count(adult_filted$age, number=4, overlap=0.10)
L<-matrix(unlist(levels(Agerange)), ncol=2, byrow = T)
L1<-data.frame(Lower=L[,1],Upper=L[,2], Interval=factor(1:nrow(L)))
#Plot of Interval
#ggplot(L1)+geom_linerange(aes(ymin = Lower, ymax = Upper, x=Interval))
index=c()
Class=c()
for(i in 1:nrow(L)){
Cl=paste("[", L1$Lower[i], ",", L1$Upper[i], "]", sep="")
ind=which(adult_filted$age>=L1$Lower[i] &adult_filted$age<=L1$Upper[i])
index=c(index,ind)
Class=c(Class, rep(Cl, length(ind)))
}
df<-adult_filted[index,]
df$Class<-as.factor(Class)
ggplot(df, aes(x=capitalLoss, y=educationNum))+
geom_point()+
facet_wrap(~Class, labeller = "label_both")